In [ ]:
from konlpy.tag import Kkma  # use the Kkma (꼬꼬마) morphological analyzer
kkma = Kkma()
text = "오늘 서울의 날씨는 추워질 전망입니다. 오후 한때 소나기가 올 예정입니다. 아, 오늘은 좀 힘드네요...이런? 난 도대체 뭐지?! 뭐랄까? 뭐라는거니"
sentences = kkma.sentences(text)
for sentence in sentences:
    print(sentence)
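Kkma can also tokenize the same text at the morpheme level; a minimal sketch reusing the `kkma` and `text` objects from above:
In [ ]:
# Morpheme-level tokenization with the same Kkma instance (illustrative sketch)
morphs = kkma.morphs(text)
print(morphs[:10])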
In [ ]:
kkma.nouns(text)
In [ ]:
pos_tag = kkma.pos(text)
print(pos_tag[:5])
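To get a feel for which parts of speech dominate, the tag column can be aggregated; a small sketch using `collections.Counter` (introduced again below for the noun counts):
In [ ]:
# Count how often each Kkma POS tag appears (sketch)
from collections import Counter
tag_counts = Counter(tag for _, tag in pos_tag)
print(tag_counts.most_common(5))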
In [ ]:
# Built-in corpora can be loaded, just like in NLTK
from konlpy.corpus import kolaw
fids = kolaw.fileids()
fids
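The bundled corpus files can be read with `kolaw.open()`; a minimal sketch assuming `fids` lists `'constitution.txt'`:
In [ ]:
# Open and peek at the first bundled corpus file (the Korean constitution) - sketch
constitution = kolaw.open(fids[0]).read()
print(constitution[:100])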
In [ ]:
# Analyzing a real data file - encoding issues
with open('pgh-2015.txt', 'r') as f:
    lines = f.read().splitlines()
print(lines[:5])
In [ ]:
# If you get "'cp949' codec can't decode byte 0xed in position 6: illegal multibyte sequence",
# the operating system's default encoding differs from UTF-8, so pass the encoding explicitly
with open('pgh-2015.txt', 'r', encoding='utf-8') as f:
    lines = f.read().splitlines()
print(lines[:5])
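If a file's encoding is not known in advance, one defensive pattern (a sketch, not part of the original analysis) is to try UTF-8 first and fall back to cp949:
In [ ]:
# Try UTF-8 first, then fall back to the Windows default cp949 (sketch)
try:
    with open('pgh-2015.txt', 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()
except UnicodeDecodeError:
    with open('pgh-2015.txt', 'r', encoding='cp949') as f:
        lines = f.read().splitlines()
print(lines[:5])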
In [ ]:
# Remove empty sentences ('')
sentences = [line for line in lines if line != '']
for line in lines[:5]:
    if line != '':
        print(line)
In [ ]:
# Morphological analysis with Komoran
from konlpy.tag import Komoran
tagger = Komoran()
tags = tagger.pos(sentences[0])
print(tags[:4])
In [ ]:
tagged_sentences = [tagger.pos(sent) for sent in sentences]
tagged_sentences[0]
In [ ]:
# Build a list of nouns
noun_list = []
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            noun_list.append(word)
noun_list[:10]
In [ ]:
# Compute word frequencies with the collections library
from collections import Counter
noun_counts = Counter(noun_list)
noun_counts.most_common(20)
In [ ]:
noun_list = []
stop_words = ['경제', '청년']
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            if word not in stop_words:
                noun_list.append(word)
In [ ]:
import collections
noun_counts = collections.Counter(noun_list)
noun_counts.most_common(10)  # confirm that '경제' and '청년' no longer appear
In [ ]:
import nltk
import matplotlib.pyplot as plt  # matplotlib for visualizing the results
%matplotlib inline
# a plot that shows the words themselves instead of word indices
freqdist = nltk.FreqDist(noun_counts)
freqdist.plot(50)
freqdist.plot(50,cumulative=True)
In [ ]:
# Fix the font so Korean labels render (Windows example)
from matplotlib import font_manager, rc
font_fname = r'C:\Windows\Fonts\NGULIM.TTF' # A font of your choice
font_name = font_manager.FontProperties(fname=font_fname).get_name()
rc('font', family=font_name)
In [ ]:
freqdist.plot(50)
freqdist.plot(50,cumulative=True)
In [ ]:
# Build a list of unique nouns
unique_nouns = set()
unique_list = []
# list-based version: keeps nouns in order of first appearance
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            if word not in unique_list:
                unique_list.append(word)
# set-based version: order is arbitrary, but membership checks are faster
for sent in tagged_sentences:
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            unique_nouns.add(word)
unique_nouns = list(unique_nouns)
noun_index = {noun: i for i, noun in enumerate(unique_nouns)}  # dictionary mapping each noun to its index
noun_index
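A quick check that every noun maps back to its position in `unique_nouns`; a small sketch:
In [ ]:
# Sanity check: the dictionary should map each noun back to its list position (sketch)
print(len(unique_nouns), len(noun_index))
print(noun_index[unique_nouns[0]], noun_index[unique_nouns[-1]])  # expected: 0 and len(unique_nouns) - 1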
In [ ]:
import numpy as np
# Create a (number of sentences) x (number of unique nouns) matrix
occurs = np.zeros([len(tagged_sentences), len(unique_nouns)])
np.shape(occurs)
In [ ]:
for i, sent in enumerate(tagged_sentences):
    for word, tag in sent:
        if tag in ['NNP', 'NNG']:
            index = noun_index[word]  # look up the noun's column index
            occurs[i][index] = 1      # mark that noun as present in sentence i
occurs[0]
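The row for sentence 0 can be decoded back into nouns to confirm the matrix was filled correctly; a minimal sketch:
In [ ]:
# Recover the nouns marked in the first sentence's row (sanity-check sketch)
nouns_in_first = [unique_nouns[idx] for idx in np.nonzero(occurs[0])[0]]
print(nouns_in_first)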
In [ ]:
# Compute the noun co-occurrence matrix:
# entry (i, j) counts the sentences that contain both noun i and noun j
co_occurs = occurs.T.dot(occurs)
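A quick way to sanity-check the co-occurrence matrix, as a sketch: since `occurs` is binary, the diagonal entry for a noun should equal the number of sentences that contain it.
In [ ]:
# Sanity check: the diagonal of co_occurs equals each noun's sentence frequency (sketch)
print(np.shape(co_occurs))
print(co_occurs[0][0], occurs[:, 0].sum())  # these two values should match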
In [ ]:
# Print pairs among the first 100 nouns that co-occur in more than one sentence
for i in range(100):
    for j in range(100):
        if co_occurs[i][j] > 1 and i > j:
            print(unique_nouns[i], unique_nouns[j], co_occurs[i][j])
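Rather than scanning only the first 100 nouns, the strongest pairs across the whole matrix can be ranked by sorting; a sketch:
In [ ]:
# Collect and rank all co-occurring noun pairs by their joint count (sketch)
pairs = []
for i in range(len(unique_nouns)):
    for j in range(i + 1, len(unique_nouns)):
        if co_occurs[i][j] > 1:
            pairs.append((co_occurs[i][j], unique_nouns[i], unique_nouns[j]))
pairs.sort(reverse=True)
print(pairs[:10])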
In [ ]:
import networkx as nx
graph = nx.Graph()
# Connect nouns that co-occur in more than 4 sentences
for i in range(len(unique_nouns)):
    for j in range(i + 1, len(unique_nouns)):
        if co_occurs[i][j] > 4:
            graph.add_edge(unique_nouns[i], unique_nouns[j])
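Before drawing, it helps to see how many nouns and links survived the threshold; a quick sketch:
In [ ]:
# Size of the co-occurrence graph after thresholding at 4 (sketch)
print(graph.number_of_nodes(), graph.number_of_edges())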
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
krfont = {'family' : 'nanumgothic', 'weight' : 'bold', 'size' : 10}
plt.rc('font',**krfont)
In [ ]:
plt.figure(figsize=(15, 15))
layout = nx.spring_layout(graph, k=.1)
nx.draw(graph, pos=layout, with_labels=True,
        font_size=20, font_family=font_name,  # Korean-capable font set earlier; Comic Sans MS cannot render Hangul labels
        alpha=0.3, node_size=3000)
plt.show()